# insurance <- read.table("BBDD_DEFINITIVA_V4.txt", header = T)
# saveRDS(insurance, "BBDD_DEFINITIVA_V4.RDS")

insurance <- readRDS("BBDD_DEFINITIVA_V4.RDS")
insurance$PERIODO           <- factor(insurance$PERIODO)
insurance$COVER             <- factor(insurance$COVER)
insurance$SEXO              <- factor(ifelse(insurance$SEXO == 0, "man", "woman"))
insurance$FUMADOR           <- factor(ifelse(insurance$FUMADOR == 0, "no", "yes"))
insurance$BUENA_SALUD       <- factor(ifelse(insurance$BUENA_SALUD == 1, "yes", "no"))
insurance$EXP               <- as.numeric(gsub(",", ".", gsub("\\.", "", insurance$EXP)))
insurance$SINIESTRO         <- factor(ifelse(insurance$SINIESTRO == 1, "yes", "no"))
insurance$DURACION          <- as.numeric(gsub(",",".", gsub("\\.", "", insurance$DURACION)))
insurance$CAPITAL_CAT1_DESC <- factor(insurance$CAPITAL_CAT1_DESC, 
                                      levels = c("0-30.000", "30.001-60.000", 
                                                 "60.001-90.000","90.001-120.000", 
                                                 "120.001-150.000", "+150.000"),
                                      ordered = T)

insurance$CAPITAL_CAT2_DESC <- factor(insurance$CAPITAL_CAT2_DESC, 
                                      levels = c("0-60.000", "60.001-120.000", 
                                                 "+120.000"),
                                      ordered = T)

insurance$CAPITAL_CAT3_DESC <- factor(insurance$CAPITAL_CAT3_DESC, 
                                      levels = c("0-100.000", "100.001-200.000", 
                                                 "+200.000"),
                                      ordered = T)

insurance$IMC               <- as.numeric(gsub(",", ".", gsub("\\.", "", insurance$IMC)))

insurance$IMC_CAT1_DESC     <- factor(insurance$IMC_CAT1_DESC, 
                                      levels = c("Normal", "Riesgo"),
                                      labels = c("normal", "risk"))

insurance$IMC_CAT2_DESC     <- factor(insurance$IMC_CAT2_DESC,
                                      levels = c("Normal", "Sobrepeso"),
                                      labels = c("normal", "overweight"))

Then, we rename the variables and remove some of them

insurance <- insurance %>%
  rename(key = CLAVE, period = PERIODO, cover = COVER, sex = SEXO,  smoker = FUMADOR,
         good_health = BUENA_SALUD, actuarial_age =  EDAD_ACTUARIAL, sus_age = EDAD_SUS,
         exp = EXP, accident = SINIESTRO , duration = DURACION, duration_cat = DURACION_CAT,
         capital = CAPITAL, capital_factor_1 = CAPITAL_CAT1_DESC, capital_factor_2 = CAPITAL_CAT2_DESC,
         capital_factor_3 = CAPITAL_CAT3_DESC, IMC_factor_1 = IMC_CAT1_DESC, IMC_factor_2 = IMC_CAT2_DESC)

insurance <- insurance %>%
  select(- c(CAPITAL_CAT1, CAPITAL_CAT2, CAPITAL_CAT3, IMC_CAT1, IMC_CAT2, IMC_CAT3))

Here, we remove the repeated columns

insurance <- tibble::rowid_to_column(insurance, "ID")

# here we take the IDs that are not repeated
IDs <- insurance %>%
  arrange(desc(accident)) %>%
  distinct_at(vars(key, period, cover), .keep_all = T) %>%
  select(ID)

# here we ensure that the rows we are removing have accident = 0
insurance %>%
  filter(!(ID %in% IDs$ID)) %>%
  select(accident) %>%
  summary()
##  accident
##  no :63  
##  yes: 0
insurance <- insurance %>%
  filter(ID %in% IDs$ID)

insurance

Also, we create another df of only the people that had some sinister

# accidentS
key_accident <- insurance %>% # tomamos todas las claves de las personas con accident
  filter(accident == "yes") %>%
  select(key)

insurance_accident <- insurance %>%
  filter(key %in% key_accident$key)

insurance_accident

And another one with the people that drop

# CAIDAS
# Here we take the keys of the caidas

keys_caidas <- insurance %>%
  filter((key %in% key_accident$key) == F) %>%
  group_by(key) %>%
  count() %>%
  mutate(CAIDA = ifelse(n == 1 | n == 2, "yes", "no")) %>%
  filter(CAIDA == "yes") %>%
  select(key)

insurance_caidas <- insurance %>%
  filter((key %in% key_accident$key) == F) %>%
  mutate(CAIDA = ifelse(key %in% keys_caidas$key, "yes", "no")) %>%
  arrange(key) 

for (i in 2:nrow(insurance_caidas)){
  if (insurance_caidas$CAIDA[i-1] == "yes") {
    if (insurance_caidas$key[i-1] == insurance_caidas$key[i]) {
      insurance_caidas$CAIDA[i-1] = "no"
    }
  }
}

insurance_caidas
# saveRDS(insurance, file = "insurance.RDS")
# saveRDS(insurance_accident, file = "insurance_accident.RDS")
# saveRDS(insurance_caidas, file = "insurance_caidas.RDS")

insurance <- readRDS("insurance.RDS")
insurance_accident <- readRDS("insurance_accident.RDS")
insurance_caidas <- readRDS("insurance_caidas.RDS")